{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a9b18433",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory that is scanned for the .txt input files.\n",
    "files_path = r'\\Data\\Word Counts'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "5c640296",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Seed NumPy's global random number generator.\n",
    "np.random.seed(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5f02ae6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Accumulator for one record (dict) per input file.\n",
    "data = list()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "7c9d625b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start from an empty list so re-running this cell cannot append duplicate rows.\n",
    "data = []\n",
    "# os.listdir returns names in arbitrary order; sorting keeps the row order reproducible.\n",
    "for file_name in sorted(os.listdir(files_path)):\n",
    "    if not file_name.endswith('.txt'):\n",
    "        continue\n",
    "    file_path = os.path.join(files_path, file_name)\n",
    "    try:\n",
    "        with open(file_path, 'r', encoding='utf-8') as file:\n",
    "            text = file.read()\n",
    "    except UnicodeDecodeError:\n",
    "        # Report files that are not valid UTF-8 and leave them out of the table.\n",
    "        print(f\"Error reading file: {file_name}\")\n",
    "        continue\n",
    "    data.append({'File': file_name, 'Text': text})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "3fa57c18",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name the columns explicitly so that an empty `data` list still yields a frame\n",
    "# with 'File' and 'Text' columns instead of a column-less frame.\n",
    "df = pd.DataFrame(data, columns=['File', 'Text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "e20b617f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add a 'Name' column that starts out as a copy of the file name.\n",
    "df = df.assign(Name=df['File'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "913d7121",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the 4-character '.txt' extension. The value is derived from 'File' rather than\n",
    "# from 'Name' itself, so re-running this cell does not strip additional characters.\n",
    "df['Name'] = df['File'].str[:-4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "5d5a522c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The year is the four characters immediately before the '.txt' extension.\n",
    "# Reading it from 'File' keeps the result independent of the current state of 'Name'.\n",
    "df['Year'] = df['File'].str[-8:-4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "b8434a0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the part of the file name before the trailing ' YYYY.txt' (9 characters).\n",
    "# The value is derived from 'File' so re-running this cell gives the same result.\n",
    "df['Name'] = df['File'].str[:-9]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "3e03973d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The 'Name' column is called 'State' from here on.\n",
    "df = df.rename({'Name': 'State'}, axis='columns')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "b337b07e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep exactly these four columns, in this order.\n",
    "df = df.loc[:, ['File', 'Text', 'State', 'Year']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "c2ac5709",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Patterns that combine several alternatives are kept under their own names.\n",
    "pattern_matter = r'\\b(matter|mass)\\w*'\n",
    "pattern_climate = r'\\b(climate\\schange)\\w*'\n",
    "pattern_all_non_evo = r'\\b(motio|matter|mass|energ|reproduc|climate\\schange|pollut|earth|tectonics|univers)\\w*'\n",
    "\n",
    "# Output column -> regex whose matches are counted in each row of df['Text'].\n",
    "# The dict's insertion order determines the order in which the columns are added.\n",
    "topic_patterns = {\n",
    "    'evolution_regex': r'\\bevol\\w*',\n",
    "    'motion_regex': r'\\bmotio\\w*',\n",
    "    'matter_regex': pattern_matter,\n",
    "    'energy_regex': r'\\benerg\\w*',\n",
    "    'reproduction_regex': r'\\breproduc\\w*',\n",
    "    'climate_regex': pattern_climate,\n",
    "    'pollution_regex': r'\\bpollut\\w*',\n",
    "    'earth_regex': r'\\bearth\\w*',\n",
    "    'tectonics_regex': r'\\btectonics\\w*',\n",
    "    'universe_regex': r'\\bunivers\\w*',\n",
    "    'all_non_evo_regex': pattern_all_non_evo,\n",
    "}\n",
    "\n",
    "# Case-insensitive match count per document, one new column per pattern.\n",
    "for column_name, topic_pattern in topic_patterns.items():\n",
    "    df[column_name] = df['Text'].str.count(topic_pattern, flags=re.IGNORECASE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "b3216fff",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                      File                                               Text  \\\n",
      "0         Alabama 2009.txt  Alabama Course of Study\\nScience\\nJoseph B. Mo...   \n",
      "1          Alaska 2000.txt  ALASKA CONTENTSTANDARDS\\n\\nSCIENCE\\n\\nA studen...   \n",
      "2          Alaska 2009.txt  \\nSCIENCE \\n\\n\\nScience as Inquiry and Process...   \n",
      "3         Arizona 2009.txt    \\n\\nARIZONA ACADEMIC CONTENT STANDARDS \\n\\nS...   \n",
      "4        Arkansas 2000.txt  ED 469 094\\n\\nTITLEINSTITUTIONPUB DATENOTEAVAI...   \n",
      "..                     ...                                                ...   \n",
      "68     Washington 2009.txt  Science \\n\\n\\nK–10 Grade Level Expectations: \\...   \n",
      "69  West Virginia 2009.txt  WEST VIRGINIA \\nSECRETARY OF STATE \\nBETTY IRE...   \n",
      "70      Wisconsin 2000.txt  ED 420 539\\n\\nAUTHORTITLE\\n\\nINSTITUTIONISBNPU...   \n",
      "71      Wisconsin 2009.txt  ED 420 539\\n\\nAUTHORTITLE\\n\\nINSTITUTIONISBNPU...   \n",
      "72        Wyoming 2009.txt   \\n\\n \\n\\nWYOMING SCIENCE CONTENT AND PERFORMA...   \n",
      "\n",
      "            State  Year  evolution_regex  motion_regex  matter_regex  \\\n",
      "0         Alabama  2009                8            18            33   \n",
      "1          Alaska  2000                1             3             4   \n",
      "2          Alaska  2009                2             2             3   \n",
      "3         Arizona  2009               30            43            67   \n",
      "4        Arkansas  2000                4            15            27   \n",
      "..            ...   ...              ...           ...           ...   \n",
      "68     Washington  2009               17            41            76   \n",
      "69  West Virginia  2009               52            89            83   \n",
      "70      Wisconsin  2000               13             8            29   \n",
      "71      Wisconsin  2009               13             8            29   \n",
      "72        Wyoming  2009               17            12            29   \n",
      "\n",
      "    energy_regex  reproduction_regex  climate_regex  pollution_regex  \\\n",
      "0             70                  18              3                9   \n",
      "1              4                   1              0                0   \n",
      "2              4                   0              0                0   \n",
      "3             73                  13              2                5   \n",
      "4             34                  11              1                0   \n",
      "..           ...                 ...            ...              ...   \n",
      "68           171                  11              0                2   \n",
      "69           142                  19              0                0   \n",
      "70            38                  10              0                1   \n",
      "71            38                  10              0                1   \n",
      "72            25                   5              0                0   \n",
      "\n",
      "    earth_regex  tectonics_regex  universe_regex  all_non_evo_regex  \n",
      "0            79                4              29                263  \n",
      "1             5                0               2                 19  \n",
      "2             5                0               2                 16  \n",
      "3           124                4              27                358  \n",
      "4           106                3               7                204  \n",
      "..          ...              ...             ...                ...  \n",
      "68          113                2              28                444  \n",
      "69          121               14              19                487  \n",
      "70           47                0              10                143  \n",
      "71           47                0              10                143  \n",
      "72           30                1              11                113  \n",
      "\n",
      "[73 rows x 15 columns]\n"
     ]
    }
   ],
   "source": [
    "# Show the frame's plain-text representation.\n",
    "print(repr(df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "a48004ec",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                      File                                               Text  \\\n",
      "0         Alabama 2009.txt  Alabama Course of Study\\nScience\\nJoseph B. Mo...   \n",
      "1          Alaska 2000.txt  ALASKA CONTENTSTANDARDS\\n\\nSCIENCE\\n\\nA studen...   \n",
      "2          Alaska 2009.txt  \\nSCIENCE \\n\\n\\nScience as Inquiry and Process...   \n",
      "3         Arizona 2009.txt    \\n\\nARIZONA ACADEMIC CONTENT STANDARDS \\n\\nS...   \n",
      "4        Arkansas 2000.txt  ED 469 094\\n\\nTITLEINSTITUTIONPUB DATENOTEAVAI...   \n",
      "..                     ...                                                ...   \n",
      "68     Washington 2009.txt  Science \\n\\n\\nK–10 Grade Level Expectations: \\...   \n",
      "69  West Virginia 2009.txt  WEST VIRGINIA \\nSECRETARY OF STATE \\nBETTY IRE...   \n",
      "70      Wisconsin 2000.txt  ED 420 539\\n\\nAUTHORTITLE\\n\\nINSTITUTIONISBNPU...   \n",
      "71      Wisconsin 2009.txt  ED 420 539\\n\\nAUTHORTITLE\\n\\nINSTITUTIONISBNPU...   \n",
      "72        Wyoming 2009.txt   \\n\\n \\n\\nWYOMING SCIENCE CONTENT AND PERFORMA...   \n",
      "\n",
      "            State  Year  evolution_regex  motion_regex  matter_regex  \\\n",
      "0         Alabama  2009                8            18            33   \n",
      "1          Alaska  2000                1             3             4   \n",
      "2          Alaska  2009                2             2             3   \n",
      "3         Arizona  2009               30            43            67   \n",
      "4        Arkansas  2000                4            15            27   \n",
      "..            ...   ...              ...           ...           ...   \n",
      "68     Washington  2009               17            41            76   \n",
      "69  West Virginia  2009               52            89            83   \n",
      "70      Wisconsin  2000               13             8            29   \n",
      "71      Wisconsin  2009               13             8            29   \n",
      "72        Wyoming  2009               17            12            29   \n",
      "\n",
      "    energy_regex  reproduction_regex  climate_regex  pollution_regex  \\\n",
      "0             70                  18              3                9   \n",
      "1              4                   1              0                0   \n",
      "2              4                   0              0                0   \n",
      "3             73                  13              2                5   \n",
      "4             34                  11              1                0   \n",
      "..           ...                 ...            ...              ...   \n",
      "68           171                  11              0                2   \n",
      "69           142                  19              0                0   \n",
      "70            38                  10              0                1   \n",
      "71            38                  10              0                1   \n",
      "72            25                   5              0                0   \n",
      "\n",
      "    earth_regex  tectonics_regex  universe_regex  all_non_evo_regex  \\\n",
      "0            79                4              29                263   \n",
      "1             5                0               2                 19   \n",
      "2             5                0               2                 16   \n",
      "3           124                4              27                358   \n",
      "4           106                3               7                204   \n",
      "..          ...              ...             ...                ...   \n",
      "68          113                2              28                444   \n",
      "69          121               14              19                487   \n",
      "70           47                0              10                143   \n",
      "71           47                0              10                143   \n",
      "72           30                1              11                113   \n",
      "\n",
      "    word_count_raw  \n",
      "0            23715  \n",
      "1              698  \n",
      "2              752  \n",
      "3            20781  \n",
      "4            11624  \n",
      "..             ...  \n",
      "68           27962  \n",
      "69           46955  \n",
      "70           11795  \n",
      "71           11795  \n",
      "72           10382  \n",
      "\n",
      "[73 rows x 16 columns]\n"
     ]
    }
   ],
   "source": [
    "# Split each document on whitespace and count the resulting tokens.\n",
    "df['word_count_raw'] = df['Text'].str.split().str.len()\n",
    "\n",
    "# Print the DataFrame, which now includes the 'word_count_raw' column.\n",
    "print(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "24ce5225",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the 'Text' column before export. Reassigning (instead of inplace=True) together\n",
    "# with errors='ignore' lets this cell be re-run without raising KeyError once the column is gone.\n",
    "df = df.drop(columns='Text', errors='ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "3d806364",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DataFrame saved as 'word_count.xlsx'\n"
     ]
    }
   ],
   "source": [
    "# Write the table to an Excel workbook without the index, then report the file name.\n",
    "output_file = 'word_count.xlsx'\n",
    "df.to_excel(output_file, index=False)\n",
    "print(f\"DataFrame saved as '{output_file}'\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "9e021db0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): presumably meant to end the session; in IPython a bare `exit` may shut down\n",
    "# the kernel - confirm this cell is still wanted.\n",
    "exit"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
